Dataset used:
NYCflights14
Data URL:
https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv
Description:
This analysis uses data.table package to analyse flight departure and arrival delay for flights leaving 3 New York City airports in year 2014.
# load packages
library(data.table)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(knitr)
library(plotly)
library(rvest)
library(stringr)
library(parallel)
# load data set as nyc14
nyc14 = fread(
'https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv')
# compute mean departure delay by origin, carrier and month
avg.dep.delay = nyc14 %>%
.[, .(avg_dep_delay = mean(dep_delay)), by = .(carrier, month, origin)]
table1 = avg.dep.delay %>%
arrange(desc(avg_dep_delay))
knitr::kable(table1[1:10, ], digits = 1,
col.names = c('Carrier', 'Month', 'NYC Airport', 'Monthly Average Departure Delay'),
caption="Table 1: Top 10 Monthly Average Departure Delay by Carrier and Origin")
| Carrier | Month | NYC Airport | Monthly Average Departure Delay |
|---|---|---|---|
| HA | 2 | JFK | 52.9 |
| HA | 1 | JFK | 41.2 |
| F9 | 8 | LGA | 34.8 |
| F9 | 4 | LGA | 33.0 |
| WN | 5 | EWR | 32.4 |
| DL | 1 | JFK | 32.4 |
| B6 | 1 | JFK | 31.9 |
| F9 | 7 | LGA | 31.6 |
| EV | 1 | EWR | 31.5 |
| AA | 6 | EWR | 31.0 |
# spaghetti plot for each month, carrier and origin
figure1 = ggplot(data = avg.dep.delay,
aes(x = month, y = avg_dep_delay, group = origin, color = origin)) +
geom_line(size = 1) +
facet_wrap(~carrier) +
scale_x_continuous(breaks = round(seq(min(avg.dep.delay$month), max(avg.dep.delay$month),
by = 1),1)) +
labs(x = "Month", y = "Average departure delay",
title = "Figure 1: Average departure delay by month, origin and carrier") +
theme(axis.text.x = element_text(angle = 90))
figure1
From Table 1 and Figure 1, Hawaiian Airlines (HA) suffers from highest average departure delay during winter (January and February). This seasonal trend also exists for most of airlines, unfortunately the data does not contains November and December to further validate this claim. For airlines operating in multiple NYC airports, on average departure delay is most severe in Newark Airport than JFK and LGA. In general smaller airlines such as airTran (FL) and Frontier (F9) tend to have longer departure delay than large airlines such as Delta (DL) and United (UA).
# function to determine which period of day is the departure time
dept <- function(x) {
if (x <= 1159) return("0:00-11:59")
if (x >= 1200 & x <= 1759) return("12:00-17:59")
if (x >= 1800) return("18:00-23:59")
}
# compute average departure delay for each origin by time windows
time_dep_delay = nyc14 %>%
.[, "departure_time_window" := sapply(dep_time, dept)] %>%
.[, .("mean departure delay" = mean(dep_delay)),
by = .(origin, departure_time_window)] %>%
.[order(origin, departure_time_window)] %>%
spread(origin, 'mean departure delay')
knitr::kable(time_dep_delay, digits=2,
caption="Table 2: Average Departure Delay by time and airport (minutes)",
col.names=c('Departure Time Window', 'EWR', 'JFK', 'LGA'))
| Departure Time Window | EWR | JFK | LGA |
|---|---|---|---|
| 0:00-11:59 | 4.62 | 4.61 | 1.96 |
| 12:00-17:59 | 13.91 | 10.22 | 10.37 |
| 18:00-23:59 | 35.88 | 24.49 | 29.61 |
Departure delay can vary drastically by flight time, it is no surprise that morning and redeye flights has the lowest average delay compared to afternoon flights and evening flights. Table 2 confirms again that EWR has the most serious delay problem among the 3 NYC airports.
avg.arr.delay = nyc14 %>%
.[, .(avg_arr_delay = mean(arr_delay)), by = .(carrier, month, origin)]
figure2 = ggplot(data = avg.arr.delay,
aes(x = month, y = avg_arr_delay, group = origin, color = origin)) +
geom_line(size = 1) +
facet_wrap(~carrier) +
scale_x_continuous(breaks = round(seq(min(avg.dep.delay$month), max(avg.dep.delay$month),
by = 1),1)) +
labs(x = "Month", y = "Average arrival delay",
title = "Figure 2: Average arrival delay by month, origin and carrier")
figure2
On average, arrival delay for flight originating from NYC is slightly better than departure delay. The range (0 to 90) of vertical axis in Figure 2 may suggests the opposite as the range of vertical axis in Figure 1 is only from 0 to 50, however it is maybe due to the severe arrival delay of Hawaiian Airlines flight in February. Therefore, I decide to investigate severe arrival delay instead of average arrival delay. I define arrival delay to be the 90 percentile of arrival delay for all flights on the route. For example, 90 percentile of arrival delay for Delta flight from LGA to DTW is 34.0, that means 90% of Delta flights on that route will be arrival delay below 34 minutes. The following 3 graphs are heatmap of severe arrival delay for each route and carrier.
# compute 90th arrival depay by carrier, origin and destination
arr.delay.90 = nyc14 %>%
.[, .(arr_delay_90 = quantile(arr_delay, 0.9)), by = .(carrier, origin, dest)]
# seperate data set into different tables for each origin
arr.delay.jfk = arr.delay.90[origin == "JFK"]
arr.delay.lga = arr.delay.90[origin == "LGA"]
arr.delay.ewr = arr.delay.90[origin == "EWR"]
# heat map for each carrier and destinations based on 90th quantile of arrival delay
ggplot(data = arr.delay.jfk, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Figure 3: 90 Percentile of Arrival Delay for flights departing from JFK")
ggplot(data = arr.delay.lga, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Figure 4: 90 Percentile of Arrival Delay for flights departing from LGA")
ggplot(data = arr.delay.ewr, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Figure 5: 90 Percentile of Arrival Delay for flights departing from EWR")
# function to determine delay category
delay.category = function(x) {
if (x <= 0) return("delay < 0 or = 0 min")
else if (x > 0 & x < 15) return("delay < 15 minutes")
else return("delay > 15 minutes")
}
# function to compute 95% confidence interval for mean
upper = function(x) {
return(mean(x) + 1.96 * sd(x)/sqrt(length(x)))
}
lower = function(x) {
return(mean(x) - 1.96 * sd(x)/sqrt(length(x)))
}
# Compute 95th confidence interval for mean relative air time
dep.delay = nyc14[, delay_category := sapply(dep_delay, delay.category)] %>%
.[, scale_air_time := (air_time - mean(air_time))/mean(air_time),
by = .(flight)] %>%
.[, .(mean = mean(scale_air_time)*100,
lowerci = lower(scale_air_time)*100,
upperci = upper(scale_air_time)*100),
by = delay_category] %>%
.[order(delay_category)]
# Display result
knitr::kable(dep.delay, digits=2, caption = 'Table 3: Average relative airtime and 95% confidence interval',
col.names = c('Delay Category', 'Average Relative Airtime(%)', 'lower bound(%)','upper bound(%)'))
| Delay Category | Average Relative Airtime(%) | lower bound(%) | upper bound(%) |
|---|---|---|---|
| delay < 0 or = 0 min | -1.08 | -1.21 | -0.94 |
| delay < 15 minutes | 2.27 | 1.98 | 2.57 |
| delay > 15 minutes | 1.19 | 0.94 | 1.44 |
# Extract the information we want from the resulting string
get_miles = function(txt){
y = str_split(txt,'\\(')[[1]]
z = str_split(y[2],' ')[[1]][1]
as.numeric(z)
}
## Encapsulate the above in a function to find the distance
## between two valid airport codes.
scrape_dist = function(a1, a2){
url = sprintf('https://www.world-airport-codes.com/distance/?a1=%s&a2=%s',
a1, a2)
srch = read_html(url)
txt =
srch %>%
html_node("strong") %>% # identified by viewing the source in a browser
html_text()
get_miles(txt)
}
# Load NYCflights14 data.
nyc14 = fread('https://github.com/arunsrinivasan/flights/wiki/NYCflights14/flights14.csv')
# unique codes for origins and destinations
orig_codes = unique(nyc14$origin)
dest_codes = unique(nyc14$dest)
airport_codes = c(orig_codes, dest_codes)
# call scrape_dist for a single fixed code vs a set of targets
get_dists = function(fixed, targets){
dists = sapply(targets, function(target) scrape_dist(fixed, target))
tibble(from=fixed, to=targets, dist=dists)
}
# Inner loop find distance between an origin to other origins and all destinations
inner_loop = function(i){
get_dists(orig_codes[i], airport_codes[{i+1}:length(airport_codes)])
}
df_dist = list()
for(i in 1:{length(orig_codes)}){
df_dist[[i]] = inner_loop(i)
}
# bind results of inner loop into a single data frame
df_dist = do.call(bind_rows, df_dist)
save(df_dist, file='./OriginsAirportDist.RData')
# load distances between origins and airports
load("OriginsAirportDist.RData")
df1 = data.table(df_dist)
# load distance between destination airports
load("AirportCodeDists.RData")
df2 = data.table(df_dist)
# merge both data.table into a single data.table
df3 = merge(df1, df2, all = TRUE)
# switch from and to columns in df3 to get a new data.table
df4 = data.table(from = df3$to, to = df3$from, dist = df3$dist)
# merge df3 and df4 to get pairwise distance between all airports
Dist = merge(df3, df4, all = T)
# convert Distance from long to wide format
Distance = dcast(Dist, from ~ to, value.var = "dist")
# delete 1st column of Distance (airports name)
Distance = Distance[, c("from") := NULL]
# convert to data.frame form as data.table does not support row names
Distance.df = as.data.frame(Distance)
rownames(Distance.df) = colnames(Distance.df)
# Convert all NA to 0
Distance.df[is.na(Distance.df)] <- 0
# Use multidimensional scaling on airports based on distances
airportMDS = cmdscale(Distance.df)
# store MDS coordinates as a data.table object
airportCoord = data.table(airport = rownames(airportMDS),
x = -1 * airportMDS[,1], y = airportMDS[, 2])
airportCoord = airportCoord %>%
mutate(airport_type = ifelse(airport %in% c('LGA', 'JFK', 'EWR'), 'NYC', 'Destination'))
save(airportCoord, file='./airportCoord.RData')
# produce two-dimensional map for 112 airports
figure6 = ggplot(data = airportCoord, aes(x = x, y = y, label = airport)) +
geom_point(aes(col = airport_type)) +
labs(x = "West <<<<<--->>>>> East", y = "South <<<<<--->>>>> North",
title = "Figure 6: Interactive Map of 112 airports recreated from Multi-dimensional Scaling") +
theme(axis.text.x=element_blank(), axis.text.y=element_blank())
ggplotly(figure6)
# number of weeks
weeks = (365-30-31)/7
# compute number of flights per week from each origin to each destination
weekly_flights = nyc14[, .("No_flights" = .N / weeks), by = .(origin, dest)]
# load dataset from problem 3
load("airportCoord.RData")
# function to find mds coordinates for airport
get_coord_x = function(target, data = airportCoord) {
return(airportCoord[airportCoord$airport == target, ]$x)
}
get_coord_y = function(target, data = airportCoord) {
return(airportCoord[airportCoord$airport == target, ]$y)
}
# generate data set
weekly_flights1 = weekly_flights[ , "origin_x" := sapply(origin, get_coord_x)] %>%
.[, "origin_y" := sapply(origin, get_coord_y)] %>%
.[, "dest_x" := sapply(dest, get_coord_x)] %>%
.[, "dest_y" := sapply(dest, get_coord_y)] %>%
.[, "weekly_flight_frequency" := No_flights ] %>%
arrange(desc(weekly_flight_frequency))
# display first 10 rows of result
knitr::kable(weekly_flights1[1:10, c(1,2,8)], digits=2, caption = "Table 4: Top 10 route by weekly flight frequency",
col.names = c('Origin', 'Destination', 'Weekly flight frequency'))
| Origin | Destination | Weekly flight frequency |
|---|---|---|
| JFK | LAX | 235.05 |
| JFK | SFO | 169.66 |
| LGA | ORD | 162.38 |
| LGA | ATL | 159.46 |
| LGA | MIA | 117.07 |
| EWR | SFO | 104.52 |
| JFK | MCO | 102.86 |
| EWR | BOS | 98.28 |
| EWR | LAX | 97.31 |
| EWR | ATL | 96.30 |
# network diagram using geom_curve
figure7 = ggplot(data = weekly_flights1, aes(x = dest_x, y = dest_y)) + geom_point() +
geom_curve(aes(x = origin_x, y = origin_y,
xend = dest_x, yend = dest_y,
size = weekly_flight_frequency,
color = origin),
arrow = arrow(length = unit(0.008, "npc")),
alpha = 1, curvature = 0.4) + scale_size(range = c(0, 2)) +
labs(x = "West <<<<<--->>>>> East", y = "South <<<<<--->>>>> North",
title = "Figure 7: Flights map with weekly frequency") +
theme(axis.text.x=element_blank(), axis.text.y=element_blank())
figure7
figure8 = ggplot(data = weekly_flights1, aes(x = dest_x, y = dest_y, Name = dest)) +
geom_point(color = 'skyblue', alpha = 0.7, aes(size = weekly_flight_frequency)) +
geom_point(aes(x = origin_x, y = origin_y),color = 'red') +
facet_wrap(~origin, dir = 'v') +
labs(x = "West <<<<<--->>>>> East", y = "South <<<<<--->>>>> North",
title = "Figure 8: Interactive map of flight route with weekly frequency") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_blank(), axis.ticks.y = element_blank())
ggplotly(figure8)